Lesson 4


Scatterplots and Perceived Audience Size

Notes:

# Show the working directory and the files available in it.
getwd()
## [1] "/home/matar/GitHub/Online_Courses/Udacity-ND/P04_Explore_And _Summarize_Data/Lessons/Explore_Two_Variable"
list.files()
## [1] "lesson4_student_files" "lesson4_student.rmd"   "pseudo_facebook.tsv"
library(ggplot2)
# The data file is tab-separated; read.delim() uses sep = "\t" by default.
pf <- read.delim('pseudo_facebook.tsv')

# Column names of the loaded data frame.
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

Scatterplots

Notes: - examine the relationship between two continuous variables

# Quick scatterplot of friend_count against age, one point per row of pf.
qplot(age, friend_count, data = pf)

# alternative (ggplot1)

What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# Distribution of age: minimum, quartiles, mean and maximum.
summary(pf[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
# Scatterplot in full ggplot syntax: the x/y mappings are wrapped in aes(),
# and the x axis is limited to ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).


Overplotting

Notes:

# Reduce overplotting: each point is drawn at 1/20 opacity, so it takes
# 20 overlapping points to reach a fully opaque mark.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).

# Same plot with jittered, semi-transparent red points.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05, colour = "red") +
  xlim(c(13, 90))
## Warning: Removed 5188 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

# Semi-transparent scatterplot with the y axis (friend_count) drawn on a
# square-root scale via coord_trans().
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(c(13, 90)) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

What do you notice?


Alpha and Jitter

Notes:

# First attempt, kept for reference. It fails with an error: the default
# jitter also moves points vertically, so rows where y is zero can end up
# below zero, which the sqrt transform cannot handle.
# ggplot(aes(x = age, y = friendships_initiated), data = pf) + 
#   geom_point(alpha = 1/20, position = 'jitter')  +
#   coord_trans(y = 'sqrt')

# Working version: jitter horizontally only (height = 0), so y values are
# left untouched and never become negative.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt")


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Conditional summaries of friend_count for each age: the mean, the median
# and the number of users, ordered by age.
pf.fc_by_age <- arrange(
  summarise(
    group_by(pf, age),
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# Inspect the first 20 age groups.
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
## 1     13          164.7500                74.0   484
## 2     14          251.3901               132.0  1925
## 3     15          347.6921               161.0  2618
## 4     16          351.9371               171.5  3086
## 5     17          350.3006               156.0  3283
## 6     18          331.1663               162.0  5196
## 7     19          333.6921               157.0  4391
## 8     20          283.4991               135.0  3769
## 9     21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
## 11    23          202.8426                93.0  4404
## 12    24          185.7121                92.0  2827
## 13    25          131.0211                62.0  3641
## 14    26          144.0082                75.0  2815
## 15    27          134.1473                72.0  2240
## 16    28          125.8354                66.0  2364
## 17    29          120.8182                66.0  1936
## 18    30          115.2080                67.5  1716
## 19    31          118.4599                63.0  1694
## 20    32          114.2800                63.0  1443

Create your plot!

# Line plot of the mean friend count at each age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() # geom_point() is the alternative geom


Overlaying Summaries with Raw Data

Notes:

# Raw friendships_initiated (orange, jittered horizontally only) overlaid with
# per-age summary lines: the mean (solid) and the 10th, 50th and 90th
# percentiles (dashed blue). The view is restricted to ages 13-90 and
# y values 0-1000 with coord_cartesian().
# NOTE(review): fun.y is deprecated in ggplot2 >= 3.3.0 in favour of fun;
# switch once the minimum supported ggplot2 version allows it.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    colour = 'Orange'
  ) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.1), linetype = 2, colour = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.5), linetype = 2, colour = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = 0.9), linetype = 2, colour = 'blue') +
  coord_cartesian(xlim = c(13, 90), ylim = c(0, 1000))

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation test between age and friend count over all users.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Alternative: evaluate the test inside pf with with(), so the columns can be
# referenced without the pf$ prefix.
with(pf, cor.test(x = age, y = friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Correlation test restricted to users aged 70 or younger.
with(subset(pf, age <= 70), cor.test(x = age, y = friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Notes: The Pearson product-moment correlation measures the strength of the linear relationship between two variables.


Create Scatterplots

Notes:

# Scatterplot of likes_received against www_likes_received.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point()


Strong Correlations

Notes: The correlation coefficient is invariant under a linear transformation of either X or Y, and the slope of the regression line when both X and Y have been transformed to z-scores is the correlation coefficient.

# Focus on the bulk of the data by capping each axis at the 95th percentile of
# the variable plotted on that axis, then overlay a linear fit in red.
# The y limit must come from likes_received (the y variable); using the
# www_likes_received percentile there would cut the y axis at the wrong value.
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
## Warning: Removed 10904 rows containing non-finite values (stat_smooth).
## Warning: Removed 10904 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_smooth).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Correlation coefficient between the two likes variables, computed on the
# full data set.
cor.test(x = pf$www_likes_received, y = pf$likes_received)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes: correlation could help us to decide which variables are related

#install.packages('alr3')
library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set and open its help page.
data("Mitchell")
help("Mitchell")

Create your plot!

#head(Mitchell, 50)

# Scatterplot of Temp against Month for the Mitchell data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Correlation test between Month and Temp.
cor.test(x = Mitchell$Month, y = Mitchell$Temp)
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063
#0.057 very weak correlation

Making Sense of Data

Notes:

#range(Mitchell$Month)
# Same scatterplot with an x-axis break at every 12th month
# (0, 12, ..., 192), i.e. one tick per year.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))


A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# New column: age in years plus a fraction derived from the birth month.
# The fraction is 1 - dob_month / 12, so dob_month 12 adds 0 and dob_month 1
# adds 11/12.
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))

Age with Months Means

# Conditional summaries of friend_count for each age_with_months value: the
# mean, the median and the number of users, ordered by age_with_months.
pf.fc_by_age_months <- arrange(
  summarise(
    group_by(pf, age_with_months),
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)

# Inspect the first few groups.
head(pf.fc_by_age_months)
## # A tibble: 6 × 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54

Programming Assignment


Noise in Conditional Means

# Line plot of the mean friend count per age_with_months value, restricted to
# values below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() # geom_point() is the alternative geom


Smoothing Conditional Means

Notes:

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Compare the yearly and monthly conditional means, each with a smoother.
# Both panels use the same cutoff (below 71) so they cover the same age range;
# filtering p1 with age < 70 would drop age 70 while p2 still shows it.
p1 <- ggplot(aes(x = age, y = friend_count_mean),
             data = subset(pf.fc_by_age, age < 71)) +
  geom_line() +
  geom_smooth()

p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
             data = subset(pf.fc_by_age_months, age_with_months < 71)) +
  geom_line() +
  geom_smooth()

# Stack the monthly plot (p2) above the yearly plot (p1) in a single column.
grid.arrange(p2, p1, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection: - scatter plots - conditional summaries (like means) - correlation coefficient


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!